Notes

import os
from collections import defaultdict, Counter

from git import Repo
import dimcat as dc
import ms3
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from utils import STD_LAYOUT, CADENCE_COLORS, CORPUS_COLOR_SCALE, chronological_corpus_order, color_background, get_corpus_display_name, get_repo_name, resolve_dir, value_count_df, get_repo_name, resolve_dir
# Location of the corpora: taken from the CORPUS_PATH environment variable,
# falling back to "~/dcml_corpora" when the variable is not set.
CORPUS_PATH = os.environ.get('CORPUS_PATH', "~/dcml_corpora")
# Printed before resolution, so the output shows the value exactly as configured.
print(f"CORPUS_PATH: '{CORPUS_PATH}'")
# resolve_dir is a helper from utils; presumably it expands "~" and returns a usable path -- confirm.
CORPUS_PATH = resolve_dir(CORPUS_PATH)
CORPUS_PATH: '~/dcml_corpora'
# Record the state of code and data this report was generated from.
repo = Repo(CORPUS_PATH)
# search_parent_directories=True: look for the git repository in the working directory or any parent.
notebook_repo = Repo('.', search_parent_directories=True)
# Commit hashes are abbreviated to their first 7 characters.
print(f"Notebook repository '{get_repo_name(notebook_repo)}' @ {notebook_repo.commit().hexsha[:7]}")
# NOTE(review): get_repo_name is called with a Repo above and with a path here; the printed output
# suggests both are accepted -- confirm against utils.
print(f"Data repo '{get_repo_name(CORPUS_PATH)}' @ {repo.commit().hexsha[:7]}")
print(f"dimcat version {dc.__version__}")
print(f"ms3 version {ms3.__version__}")
Notebook repository 'data_reports' @ fc05ef0
Data repo 'dcml_corpora' @ 7b1478f
dimcat version 0.3.0.post1.dev13+ga5d37ea
ms3 version 1.2.5

Data loading

Detected files

# Create the dataset and register all files found under CORPUS_PATH.
dataset = dc.Dataset()
# NOTE(review): with parse_tsv=False the overview below reports that no files have been parsed,
# i.e. files appear to be detected only at this point -- confirm against the dimcat documentation.
dataset.load(directory=CORPUS_PATH, parse_tsv=False)
# Last expression of the cell: displays the overview of detected files per corpus.
dataset.data
No files have been parsed for analysis.
[default|all]
All corpora
-----------
View: This view is called 'default'. It
	- excludes fnames that are not contained in the metadata,
	- filters out file extensions requiring conversion (such as .xml), and
	- excludes review files and folders.

                               has   active   scores measures    notes expanded
                          metadata     view detected detected detected detected
corpus
ABC                            yes  default       70       70       70       70
beethoven_piano_sonatas        yes  default       87       87       87       64
chopin_mazurkas                yes  default       55       55       55       55
corelli                        yes  default      149      149      149      149
debussy_suite_bergamasque      yes  default        4        4        4        4
dvorak_silhouettes             yes  default       12       12       12       12
grieg_lyric_pieces             yes  default       66       66       66       66
liszt_pelerinage               yes  default       19       19       19       19
medtner_tales                  yes  default       19       19       19       19
mozart_piano_sonatas           yes  default       54       54       54       54
schumann_kinderszenen          yes  default       13       13       13       13
tchaikovsky_seasons            yes  default       12       12       12       12

3838/11057 files are excluded from this view.

3759 files have been excluded based on their subdir.
79 files have been excluded based on their file name.


There are 1 orphans that could not be attributed to any of the respective corpus's fnames.

Filtering

# Configure a view named 'annotated' that keeps only the facets needed below.
annotated_view = dataset.data.get_view('annotated')
# 'notes$' looks like a pattern anchored at the end of the facet name -- presumably to avoid
# matching other facets that merely contain "notes"; confirm.
annotated_view.include('facets', 'measures', 'notes$', 'expanded')
# Per the view description printed below, this excludes pieces that do not have
# at least one file per selected facet.
annotated_view.fnames_with_incomplete_facets = False
dataset.data.set_view(annotated_view)
# Last expression of the cell: displays the overview under the new view.
dataset.data
[annotated|all|default]
All corpora
-----------
View: This view is called 'annotated'. It
	- excludes fnames that are not contained in the metadata,
	- excludes pieces that do not have at least one file per selected facet,
	- filters out file extensions requiring conversion (such as .xml),
	- excludes review files and folders, and
	- includes only facets containing one of ['measures', 'notes$', 'expanded'].

                               has     active measures    notes expanded
                          metadata       view detected detected detected
corpus
ABC                            yes  annotated       70       70       70
beethoven_piano_sonatas        yes  annotated       64       64       64
chopin_mazurkas                yes  annotated       55       55       55
corelli                        yes  annotated      149      149      149
debussy_suite_bergamasque      yes  annotated        4        4        4
dvorak_silhouettes             yes  annotated       12       12       12
grieg_lyric_pieces             yes  annotated       66       66       66
liszt_pelerinage               yes  annotated       19       19       19
medtner_tales                  yes  annotated       19       19       19
mozart_piano_sonatas           yes  annotated       54       54       54
schumann_kinderszenen          yes  annotated       13       13       13
tchaikovsky_seasons            yes  annotated       12       12       12

9/12 facets are excluded from this view.
25/8264 files are excluded from this view.

25 files have been excluded based on their file name.


There are 1 orphans that could not be attributed to any of the respective corpus's fnames.
# Number of pieces remaining under the active ('annotated') view.
print(f"N = {dataset.data.count_pieces()} annotated pieces.")
N = 537 annotated pieces.

Metadata

# Metadata of all pieces in the active view as a single DataFrame.
all_metadata = dataset.data.metadata()
# Sanity check: the number of metadata rows should match the number of pieces.
print(f"Concatenated 'metadata.tsv' files cover {len(all_metadata)} of the {dataset.data.count_pieces()} scores.")
# Preview: the first row of each corpus (index level 0), restricted to the first 20 columns.
# Index level 1 (shown as 'fname' in the output) is turned into a regular column first.
all_metadata.reset_index(level=1).groupby(level=0).nth(0).iloc[:,:20]
Concatenated 'metadata.tsv' files cover 537 of the 537 scores.
fname TimeSig KeySig last_mc last_mn length_qb last_mc_unfolded last_mn_unfolded length_qb_unfolded volta_mcs all_notes_qb n_onsets n_onset_positions guitar_chord_count form_label_count label_count annotated_key harmony_version annotators reviewers
corpus
ABC n01op18-1_01 1: 3/4 1: -1 313 313 939.0 427.0 427.0 1281.0 NaN 3132.75 4589 1950 0 0 405 F 1.0.0 Markus Neuwirth NaN
beethoven_piano_sonatas 01-1 1: 2/2 1: -4 154 152 608.0 308.0 304.0 1216.0 NaN 1476.00 1679 985 0 0 241 f 2.3.0 Lars & Ya-Chuan (2.2.0), John Heilig (2.3.0) AN
chopin_mazurkas BI105-2op30-2 1: 3/4 1: 2 65 64 193.0 65.0 64.0 193.0 NaN 711.00 810 274 0 0 116 b 2.3.0 Wendelin Bitzan (1.0.0), Adrian Nagel (2.2.0),... JH, AN, DK
corelli op01n01a 1: 4/4 1: -1 14 14 56.0 14.0 14.0 56.0 NaN 224.00 280 110 0 0 64 F 2.3.0 Lars Opfermann, Ya-Chuan Wu (2.1.1), Hanné Bec... HB, JH
debussy_suite_bergamasque l075-01_suite_prelude 1: 4/4 1: -1 89 89 356.0 89.0 89.0 356.0 NaN 1533.67 1721 870 0 0 274 F 2.3.0 Adrian Nagel (2.1.1), Amelia Brey (2.3.0) AB, AN
dvorak_silhouettes op08n01 1: 6/8 1: 4, 7: -5, 49: 4 54 52 156.5 54.0 52.0 156.5 NaN 658.75 957 288 0 0 80 c# 2.3.0 Daniel Grote (2.1.1), Hanné Becker (2.3.0) Johannes Hentschel (2.1.1), AN
grieg_lyric_pieces op12n01 1: 2/4 1: -3 23 23 46.0 23.0 23.0 46.0 NaN 135.50 268 156 0 0 43 Eb 2.3.0 Adrian Nagel (2.1.1), John Heilig (2.30) Adrian Nagel
liszt_pelerinage 160.01_Chapelle_de_Guillaume_Tell 1: 4/4 1: 0 97 97 388.0 97.0 97.0 388.0 NaN 1902.42 2879 1069 0 0 174 C 2.3.0 Adrian Nagel (2.1.1), Amelia Brey (2.3.0) Johannes Hentschel (1-33 & 82-97), AB, AN
medtner_tales op08n01 1: 4/8 1: -3 81 81 162.0 81.0 81.0 162.0 NaN 603.00 1481 528 0 0 213 c 2.3.0 Wendelin Bitzan (2.2.0), John Heilig (2.3.0) Adrian Nagel, DK
mozart_piano_sonatas K279-1 1: 4/4 1: 0 100 100 400.0 200.0 200.0 800.0 NaN 767.00 2031 1441 0 0 251 C NaN Uli Kneisel Johannes Hentschel, Markus Neuwirth
schumann_kinderszenen n01 1: 2/4 1: 1 22 22 44.0 44.0 44.0 88.0 NaN 134.33 241 141 0 0 44 G 2.3.0 Tal Soker (2.1.1), John Heilig (2.3.0) AN, JHei, JH
tchaikovsky_seasons op37a01 1: 3/4 1: 3, 29: 1, 63: 3 103 103 309.0 103.0 103.0 309.0 NaN 1058.17 1537 829 0 0 313 A 2.3.0 Adrian Nagel (2.1.1), John Heilig (2.3.0) Johannes Hentschel, AN

Compute chronological order

# Corpus names ordered by the chronological_corpus_order helper from utils.
chronological_order = chronological_corpus_order(all_metadata)
# Pair each corpus with a color; zip() stops at the shorter of the two sequences,
# so corpora beyond the length of CORPUS_COLOR_SCALE would get no entry.
corpus_colors = dict(zip(chronological_order, CORPUS_COLOR_SCALE))
# Last expression of the cell: displays the resulting order.
chronological_order
['corelli',
 'mozart_piano_sonatas',
 'beethoven_piano_sonatas',
 'ABC',
 'chopin_mazurkas',
 'schumann_kinderszenen',
 'liszt_pelerinage',
 'tchaikovsky_seasons',
 'dvorak_silhouettes',
 'grieg_lyric_pieces',
 'debussy_suite_bergamasque',
 'medtner_tales']
# All note tables of the active view as one DataFrame
# (index levels as displayed below: corpus, fname, i).
all_notes = dataset.data.get_all_parsed('notes', force=True, flat=True)
# Files are counted as the number of distinct (corpus, fname) groups, i.e. index levels 0 and 1.
print(f"{len(all_notes.index)} notes over {len(all_notes.groupby(level=[0,1]))} files.")
all_notes.head()
851989 notes over 537 files.
mc mn quarterbeats duration_qb mc_onset mn_onset timesig staff voice duration ... nominal_duration scalar tied tpc midi name octave chord_id tremolo volta
corpus fname i
ABC n01op18-1_01 0 1 1 0 1.0 0 0 3/4 3 1 1/4 ... 1/4 1 1 -1 53 F3 3 12 NaN <NA>
1 1 1 0 1.0 0 0 3/4 4 1 1/4 ... 1/4 1 1 -1 53 F3 3 18 NaN <NA>
2 1 1 0 1.0 0 0 3/4 1 1 1/4 ... 1/4 1 1 -1 65 F4 4 0 NaN <NA>
3 1 1 0 1.0 0 0 3/4 2 1 1/4 ... 1/4 1 1 -1 65 F4 4 6 NaN <NA>
4 1 1 1 0.5 1/4 1/4 3/4 3 1 1/8 ... 1/8 1 -1 -1 53 F3 3 13 NaN <NA>

5 rows × 21 columns

def weight_notes(nl, group_col='midi', precise=True):
    """Turn summed note durations per group into repeated group values.

    The rows of ``nl`` are grouped by ``group_col`` and their ``duration_qb``
    values are summed. The sums are then scaled so that the smallest positive
    sum equals 1 and handed to ``repeat_notes_according_to_weights()``.

    Args:
        nl: DataFrame with a ``duration_qb`` column and the column(s) named by
            ``group_col``.
        group_col: Column (or columns) to group by.
        precise: If False, the scaled sums are additionally divided by 1.9999999,
            which roughly halves the number of repetitions at the cost of precision.

    Returns:
        The result of ``repeat_notes_according_to_weights()`` for the scaled sums.
    """
    totals = nl.groupby(group_col).duration_qb.sum()
    # Scale such that the shortest positive total corresponds to exactly one occurrence.
    smallest_positive = totals[totals > 0].min()
    weights = totals / smallest_positive
    if precise:
        return repeat_notes_according_to_weights(weights)
    # Dividing by slightly less than 2 puts the smallest weight just above 0.5;
    # at exactly 0.5 it would be rounded down by repeat_notes_according_to_weights().
    return repeat_notes_according_to_weights(weights / 1.9999999)

def repeat_notes_according_to_weights(weights):
    """Expand a Series of weights into a Series of repeated index labels.

    Every index label of ``weights`` (e.g. a pitch) is repeated as many times as
    its weight after ``Series.round()``, yielding one row per (weighted) occurrence.

    Args:
        weights: Series mapping labels (its index) to numeric weights.

    Returns:
        A Series with a default integer index that contains each label
        ``round(weight)`` times, in the order of ``weights``. Labels whose rounded
        weight is zero or negative are left out. If the weights cannot be converted
        to integers (for example because they contain NaN or inf), an empty integer
        Series is returned.
    """
    try:
        counts = weights.round().astype(int)
    except (ValueError, TypeError):
        # Best effort: weights that cannot be cast to int yield an empty result
        # rather than an error. Unrelated errors are no longer swallowed.
        return pd.Series(dtype=int)
    # A negative count contributes nothing, mirroring ``[label] * negative == []``.
    repeats = counts.clip(lower=0).to_numpy()
    # Vectorized repetition of the labels; converting to a NumPy array keeps the
    # resulting Series unnamed, regardless of the name of the index.
    return pd.Series(weights.index.repeat(repeats).to_numpy())

Ambitus

# Map corpus identifiers to display names, preserving the chronological order.
corpus_names = {corp: get_corpus_display_name(corp) for corp in chronological_order}
chronological_corpus_names = list(corpus_names.values())
# Same colors as corpus_colors, but keyed by display name for use in the plots.
corpus_name_colors = {corpus_names[corp]: color for corp, color in corpus_colors.items()}
# Adds a 'corpus_name' column to all_notes, derived from index level 0 (the corpus).
all_notes['corpus_name'] = all_notes.index.get_level_values(0).map(corpus_names)
grouped_notes = all_notes.groupby('corpus_name')
# One duration-weighted series of MIDI pitches per corpus (precise=False keeps the result smaller),
# concatenated with the corpus name as outer index level, which reset_index turns into a column.
weighted_midi = pd.concat([weight_notes(nl, 'midi', precise=False) for _, nl in grouped_notes], keys=grouped_notes.groups.keys()).reset_index(level=0)
weighted_midi.columns = ['dataset', 'midi']
weighted_midi
dataset midi
0 Beethoven Sonatas 24
1 Beethoven Sonatas 24
2 Beethoven Sonatas 24
3 Beethoven Sonatas 24
4 Beethoven Sonatas 24
... ... ...
13639 Tchaikovsky Seasons 91
13640 Tchaikovsky Seasons 91
13641 Tchaikovsky Seasons 92
13642 Tchaikovsky Seasons 92
13643 Tchaikovsky Seasons 93

713189 rows × 2 columns

# Label the y-axis at every C (MIDI 12 = C0 up to MIDI 96 = C7) instead of raw MIDI numbers.
yaxis=dict(tickmode= 'array',
           tickvals= [12, 24, 36, 48, 60, 72, 84, 96],
           ticktext = ["C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7"],
           gridcolor='lightgrey',
           )
# One violin (with inner box plot) per corpus, ordered chronologically and colored per corpus.
fig = px.violin(weighted_midi,
                x='dataset',
                y='midi',
                color='dataset',
                box=True,
                labels=dict(
                    dataset='',
                    midi='distribution of pitches by duration'
                ),
                category_orders=dict(dataset=chronological_corpus_names),
                color_discrete_map=corpus_name_colors,
                width=1000, height=600,
               )
fig.update_traces(spanmode='hard') # do not extend beyond outliers
# The legend is hidden because the x-axis already names each corpus.
fig.update_layout(yaxis=yaxis,
                  **STD_LAYOUT,
                 showlegend=False)
fig.show()

Tonal Pitch Classes (TPC)

# One duration-weighted series of tonal pitch classes per corpus (default precise=True),
# concatenated with the corpus name as outer index level, which reset_index turns into a column.
weighted_tpc = pd.concat([weight_notes(nl, 'tpc') for _, nl in grouped_notes], keys=grouped_notes.groups.keys()).reset_index(level=0)
weighted_tpc.columns = ['dataset', 'tpc']
weighted_tpc
dataset tpc
0 Beethoven Sonatas -10
1 Beethoven Sonatas -9
2 Beethoven Sonatas -9
3 Beethoven Sonatas -9
4 Beethoven Sonatas -9
... ... ...
27284 Tchaikovsky Seasons 14
27285 Tchaikovsky Seasons 14
27286 Tchaikovsky Seasons 14
27287 Tchaikovsky Seasons 14
27288 Tchaikovsky Seasons 15

845993 rows × 2 columns

As violin plot

# Label every third position on the line of fifths (0 = C) with its note name
# and draw the zero line so that C is visually marked.
yaxis=dict(
    tickmode= 'array',
    tickvals= [-12, -9, -6, -3, 0, 3, 6, 9, 12, 15, 18],
    ticktext = ["Dbb", "Bbb", "Gb", "Eb", "C", "A", "F#", "D#", "B#", "G##", "E##"],
    gridcolor='lightgrey',
    zerolinecolor='lightgrey',
    zeroline=True
           )
# One violin (with inner box plot) per corpus, ordered chronologically and colored per corpus.
fig = px.violin(weighted_tpc,
                x='dataset',
                y='tpc',
                color='dataset',
                box=True,
                labels=dict(
                    dataset='',
                    tpc='distribution of tonal pitch classes by duration'
                ),
                category_orders=dict(dataset=chronological_corpus_names),
                color_discrete_map=corpus_name_colors,
                width=1000,
                height=600,
               )
fig.update_traces(spanmode='hard') # do not extend beyond outliers
# The legend is hidden because the x-axis already names each corpus.
fig.update_layout(yaxis=yaxis,
                  **STD_LAYOUT,
                 showlegend=False)
fig.show()

As bar plots

# Total duration (in quarter notes) per tonal pitch class over all corpora.
bar_data = all_notes.groupby('tpc').duration_qb.sum().reset_index()
# Every integer between the smallest and largest observed tpc gets its own tick,
# including values that do not occur in the data.
x_values = list(range(bar_data.tpc.min(), bar_data.tpc.max()+1))
# ms3.fifths2name presumably converts line-of-fifths integers to note names -- confirm in the ms3 docs.
x_names = ms3.fifths2name(x_values)
fig = px.bar(bar_data, x='tpc', y='duration_qb',
             labels=dict(tpc='Named pitch class',
                             duration_qb='Duration in quarter notes'
                            ),
             color_discrete_sequence=CORPUS_COLOR_SCALE,
             width=1000, height=300,
             )
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
# Named ticks for every pitch class plus darker minor grid lines every 6 positions.
fig.update_xaxes(gridcolor='lightgrey', zerolinecolor='grey', tickmode='array',
                 tickvals=x_values, ticktext = x_names, dtick=1, ticks='outside', tickcolor='black',
                 minor=dict(dtick=6, gridcolor='grey', showgrid=True),
                )
fig.show()
# Total duration per corpus and tonal pitch class; also used by the scatter plots further down.
scatter_data = all_notes.groupby(['corpus_name', 'tpc']).duration_qb.sum().reset_index()
# Bars per tonal pitch class, colored by corpus.
fig = px.bar(scatter_data, x='tpc', y='duration_qb', color='corpus_name',
                 labels=dict(
                     duration_qb='duration',
                     tpc='named pitch class',
                 ),
                 # category_orders is keyed by column name. The corpus column of scatter_data is
                 # 'corpus_name' (not 'dataset'), otherwise the chronological order is ignored.
                 category_orders=dict(corpus_name=chronological_corpus_names),
                 color_discrete_map=corpus_name_colors,
                 width=1000, height=500,
                )
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
# Named ticks for every pitch class (x_values / x_names are defined in the preceding cell)
# plus darker minor grid lines every 6 positions.
fig.update_xaxes(gridcolor='lightgrey', zerolinecolor='grey', tickmode='array',
                 tickvals=x_values, ticktext = x_names, dtick=1, ticks='outside', tickcolor='black',
                 minor=dict(dtick=6, gridcolor='grey', showgrid=True),
                )
fig.show()

As scatter plots

# One facet per corpus showing the total duration of each tonal pitch class.
fig = px.scatter(scatter_data, x='tpc', y='duration_qb', color='corpus_name',
                 labels=dict(
                     duration_qb='duration',
                     tpc='named pitch class',
                 ),
                 # category_orders is keyed by column name. The facet/color column is
                 # 'corpus_name' (not 'dataset'), otherwise the chronological order is ignored.
                 category_orders=dict(corpus_name=chronological_corpus_names),
                 color_discrete_map=corpus_name_colors,
                 facet_col='corpus_name', facet_col_wrap=3, facet_col_spacing=0.03,
                 width=1000, height=1000,
                )
fig.update_traces(mode='lines+markers')
# Facet titles default to "corpus_name=<value>"; keep only the value.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(**STD_LAYOUT, showlegend=False)
# Label every sixth position on the line of fifths (0 = C) on all facets.
fig.update_xaxes(gridcolor='lightgrey', zerolinecolor='lightgrey', tickmode='array', tickvals= [-12, -6, 0, 6, 12, 18],
    ticktext = ["Dbb", "Gb", "C", "F#", "B#", "E##"], visible=True, )
# matches=None gives every facet its own y-range; tick labels are shown on each facet accordingly.
fig.update_yaxes(gridcolor='lightgrey', zeroline=False, matches=None, showticklabels=True)
fig.show()
# On the line of fifths used for the tick labels above (0 = C), the values -1..5 are
# F, C, G, D, A, E, B, i.e. the note names without accidentals. between() includes both bounds.
no_accidental = bar_data[bar_data.tpc.between(-1,5)].duration_qb.sum()
with_accidental = bar_data[~bar_data.tpc.between(-1,5)].duration_qb.sum()
entire = no_accidental + with_accidental
# Last expression of the cell: the formatted string is displayed as the cell result.
f"Fraction of note duration without accidental of the entire durations: {no_accidental} / {entire} = {no_accidental / entire}"
'Fraction of note duration without accidental of the entire durations: 382962.1904040404 / 573925.7076719577 = 0.6672678802931973'

Notes and staves

print("Distribution of notes over staves:")
# value_count_df is a helper from utils; per the output below it tabulates counts
# and their share of the total for each staff number.
value_count_df(all_notes.staff)
Distribution of notes over staves:
counts %
staff
1 375411 0.440629
2 329671 0.386943
3 79538 0.093356
4 67369 0.079073